// Copyright 2024 The Google Research Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Molecular graph featurization protos
syntax = "proto2";

package research_gigamol;

// Molecule graphs
// Graph convolutions are built on atoms and atom pairs
// Possible additional considerations:
// * bond stereochemistry: E/Z
//
// NOTE: Several of the enums in this proto do not have a value that is
// 0. We reserve 0 for none/missing values. This means in the conversion to
// one-hot vectors, you have to be a bit careful about when you want to shift
// and when you don't (see molecule_graph_featurizer.{cc,h}).
message MoleculeGraph {
  // A single atom (graph node) and its per-atom features.
  message Atom {
    // Atomic number (number of protons).
    optional int32 atomic_num = 1;
    // Atomic symbol (H=Hydrogen, C=Carbon, etc.).
    optional string element = 2;
    // Generic atom type. Some elements have a separate *_AROMATIC value.
    //
    // The values are intentionally not listed in numeric order. ATOM_NONE is
    // declared first, so under proto2 it is the default returned for an unset
    // `type` field even though its number is 23. Do not move another value
    // ahead of it and do not renumber: the numbers are the wire contract.
    enum AtomType {
      ATOM_NONE = 23;
      ATOM_OTHER = 0;
      ATOM_H = 1;
      ATOM_C = 2;
      ATOM_C_AROMATIC = 3;
      ATOM_N = 4;
      ATOM_N_AROMATIC = 5;
      ATOM_O = 6;
      ATOM_O_AROMATIC = 7;
      ATOM_F = 8;
      ATOM_NA = 9;
      ATOM_MG = 10;
      ATOM_SI = 11;
      ATOM_P = 12;
      ATOM_P_AROMATIC = 20;
      ATOM_S = 13;
      ATOM_S_AROMATIC = 14;
      ATOM_CL = 15;
      ATOM_K = 16;
      ATOM_CA = 17;
      ATOM_BR = 18;
      ATOM_I = 19;
      ATOM_GE = 22;
      ATOM_METAL = 21;
      ATOM_B = 24;
      ATOM_SE = 25;
    }
    optional AtomType type = 3;
    // Substituent configuration (chirality) of the atom.
    enum ChiralType {
      CHIRAL_NONE = 0;
      CHIRAL_R = 1;
      CHIRAL_S = 2;
    }
    optional ChiralType chirality = 4;
    // Formal atomic charge.
    optional int32 formal_charge = 5;
    // Partial atomic charge.
    optional float partial_charge = 6;
    // DEPRECATED: Size of the smallest ring that includes this atom.
    // NOTE(review): presumably superseded by `ring_sizes` below -- confirm.
    // The field is kept (not removed) so that its number stays occupied and
    // existing serialized data still parses.
    optional int32 smallest_ring = 7 [deprecated = true];
    // Sizes of all rings that include this atom.
    repeated int32 ring_sizes = 8;
    // Hybridization.
    // Note that the zero type will usually be ignored (see note above).
    // HYBRIDIZATION_S was given number 6 but is listed second; the numbers,
    // not the listing order, are what is stored on the wire.
    enum HybridizationType {
      HYBRIDIZATION_OTHER = 0;
      HYBRIDIZATION_S = 6;
      HYBRIDIZATION_SP = 1;
      HYBRIDIZATION_SP2 = 2;
      HYBRIDIZATION_SP3 = 3;
      HYBRIDIZATION_SP3D = 4;
      HYBRIDIZATION_SP3D2 = 5;
    }
    optional HybridizationType hybridization = 9;
    // Hydrogen bonding: whether the atom is flagged as an acceptor / a donor.
    optional bool acceptor = 10;
    optional bool donor = 11;
    // Whether the atom is aromatic.
    optional bool aromatic = 12;
    // Moments of the nuclear Coulomb potential calculated on spheres centered
    // around the atom. (The field name keeps the historical "coulumb"
    // spelling; renaming it would break generated code and text format.)
    repeated float coulumb_sphere_features = 13;
  }
  // Atom pairs, including bond information and distance between atoms.
  message AtomPair {
    // Indices of Atom messages described by this pair.
    // Atom order is not important, but messages should only be created for one
    // of the two possible orderings.
    // NOTE(review): presumably these index into MoleculeGraph.atoms -- confirm.
    optional int32 a_idx = 1;
    optional int32 b_idx = 2;
    // Basic list of bond types.
    enum BondType {
      BOND_NONE = 0;
      BOND_SINGLE = 1;
      BOND_DOUBLE = 2;
      BOND_TRIPLE = 3;
      BOND_AROMATIC = 4;
    }
    optional BondType bond_type = 3;
    // Number of bonds in the shortest path between these atoms on the graph,
    // or 1e8 if there is no path between the pair of atoms.
    optional int32 graph_distance = 4;
    // Ring membership; true if atoms are in the same ring.
    optional bool same_ring = 5;
    // Euclidean distance between atoms for the standard conformer.
    optional float spatial_distance = 6;
    // Coulomb energy between nuclei,
    // atomic_num_1*atomic_num_2/spatial_distance. (The field name keeps the
    // historical "coulumb" spelling for compatibility.)
    optional float coulumb_energy = 7;
  }
  // Label (target) data attached to a molecule.
  // NOTE(review): the exact meaning of each field is not documented in this
  // file; the descriptions below are inferred from the field names and should
  // be confirmed against the code that reads/writes labels.
  message Label {
    // Integer class target(s); stored packed on the wire.
    repeated int64 target_class = 1 [packed = true];
    // Real-valued target(s); stored packed on the wire.
    repeated float target_value = 2 [packed = true];
    // String-valued class target(s).
    repeated string target_class_string = 3;
    // Weight for this label; defaults to 1.0 when unset.
    optional float weight = 4 [default = 1.0];
  }
  // Atoms of the molecule.
  repeated Atom atoms = 1;
  // Atom-pair records for the molecule.
  repeated AtomPair atom_pairs = 2;
  // Labels for the molecule.
  repeated Label labels = 3;
  // Binary features for this molecule, e.g. circular fingerprint.
  repeated bool binary_features = 4;
  // Floating point features for this molecule.
  repeated float float_features = 7;
  // SMILES string, if available.
  optional string smiles = 5;
  // Total atomic number -- the negation of total electron charge.
  // Use this rather than total number of atoms if you want to plot prediction
  // accuracy vs. some measure of molecule size, since it is more directly
  // related to the chemistry.
  optional int32 tot_atomic_num = 6;
}

// A simple, basically flat description of a molecule. This is intended to be
// used for variance function estimates (i.e. are there any simple descriptions
// of a molecule that are related to the error that some model makes for that
// prediction).
message SimpleMoleculeFeatures {
  // Summary statistics for a distribution of values.
  // Every statistic, including `count`, is stored as a float.
  message DistributionSummary {
    // Smallest value.
    optional float min = 1;
    // Largest value.
    optional float max = 2;
    // Arithmetic mean.
    optional float mean = 3;
    // Median.
    optional float median = 4;
    // Number of values summarized (note: float, not an integer type).
    optional float count = 5;
    // Presumably the standard deviation -- TODO confirm (and whether it is the
    // population or sample estimate) against the code that fills this in.
    optional float std = 6;
  }

  // A count of how many atoms of each element.
  // NOTE(review): the key is presumably the element symbol, as in
  // MoleculeGraph.Atom.element -- confirm.
  map<string, int32> element_type_counts = 1;
  // Total number of atoms.
  // NOTE(review): the previous comment here was identical to the one on
  // num_heavy_atoms ("not including hydrogen"), which looks like a copy-paste
  // slip; presumably this count includes hydrogens -- confirm against the
  // code that populates this message.
  optional int32 num_atoms = 13;
  // Number of atoms not including hydrogen.
  optional int32 num_heavy_atoms = 2;
  // Whether there are any chiral centers in the molecule.
  optional bool is_chiral = 3;
  // Total number of hydrogen bond acceptors / donors.
  optional int32 num_hbond_acceptor = 4;
  optional int32 num_hbond_donor = 5;
  // Summary of the partial and formal charges.
  optional DistributionSummary partial_charges_distribution = 6;
  optional DistributionSummary formal_charges_distribution = 7;

  // Whether there is any aromatic ring in the molecule.
  optional bool is_aromatic = 8;
  // Number of bonds of each type. The key is the numeric value of the enum
  // MoleculeGraph.AtomPair.BondType; it is an int32 because enums can't be
  // keys in maps (sigh).
  map<int32, int32> bond_type_counts = 9;
  // List of all the unique sizes of rings present in the molecule. Note that we
  // don't have the number of distinct rings; this is a limitation of the
  // current MoleculeGraph.
  repeated int32 ring_sizes = 10;
  // Summary of all the pairwise graph and spatial distances.
  optional DistributionSummary graph_distances_distribution = 11;
  optional DistributionSummary spatial_distances_distribution = 12;
}
